package cc.mallet.topics.gui;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import cc.mallet.pipe.CharSequence2TokenSequence;
import cc.mallet.pipe.CharSequenceRemoveHTML;
import cc.mallet.pipe.CharSubsequence;
import cc.mallet.pipe.FeatureSequence2AugmentableFeatureVector;
import cc.mallet.pipe.Input2CharSequence;
import cc.mallet.pipe.Pipe;
import cc.mallet.pipe.PrintInputAndTarget;
import cc.mallet.pipe.SaveDataInSource;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.Target2Label;
import cc.mallet.pipe.TokenSequence2FeatureSequence;
import cc.mallet.pipe.TokenSequence2FeatureSequenceWithBigrams;
import cc.mallet.pipe.TokenSequenceLowercase;
import cc.mallet.pipe.TokenSequenceNGrams;
import cc.mallet.pipe.TokenSequenceRemoveNonAlpha;
import cc.mallet.pipe.TokenSequenceRemoveStopwords;
import cc.mallet.pipe.iterator.FileIterator;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CharSequenceLexer;
import cc.mallet.util.CommandOption;
import cc.mallet.util.MalletLogger;
import cc.mallet.util.Strings;

/**
 * Command-line tool that reads text documents from one directory per class,
 * runs them through a configurable chain of MALLET pipes and serializes the
 * resulting {@link InstanceList}.
 * <p>
 * The pipe chain is either built from the command-line options declared below
 * or, when {@code --use-pipe-from} is given, taken from a previously saved
 * instance list. The {@code --enron-dataset} option swaps the plain file
 * reader for {@code EnronEmail2CharSequence}.
 */
public class EnronEmail2Vectors {
	private static final Logger logger = MalletLogger
			.getLogger(EnronEmail2Vectors.class.getName());

	// ------------------------------------------------------------------
	// Command-line options. Each option is bound to this class so that
	// CommandOption.process(EnronEmail2Vectors.class, args) can populate it.
	// ------------------------------------------------------------------

	static CommandOption.SpacedStrings classDirs = new CommandOption.SpacedStrings(
			EnronEmail2Vectors.class,
			"input",
			"DIR...",
			true,
			null,
			"The directories containing text files to be classified, one directory per class",
			null);

	static CommandOption.File outputFile = new CommandOption.File(
			EnronEmail2Vectors.class, "output", "FILE", true, new File(
					"text.vectors"),
			"Write the instance list to this file; Using - indicates stdout.",
			null);

	static CommandOption.File usePipeFromVectorsFile = new CommandOption.File(
			EnronEmail2Vectors.class,
			"use-pipe-from",
			"FILE",
			true,
			new File("text.vectors"),
			"Use the pipe and alphabets from a previously created vectors file. "
					+ "Allows the creation, for example, of a test set of vectors that are "
					+ "compatible with a previously created set of training vectors",
			null);

	static CommandOption.Boolean preserveCase = new CommandOption.Boolean(
			EnronEmail2Vectors.class, "preserve-case", "[TRUE|FALSE]", false,
			false, "If true, do not force all strings to lowercase.", null);

	static CommandOption.Boolean enronDataset = new CommandOption.Boolean(
			EnronEmail2Vectors.class,
			"enron-dataset",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, specifically pre-process the Enron dataset input to remove email header.",
			null);

	static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(
			EnronEmail2Vectors.class,
			"remove-stopwords",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, remove a default list of common English \"stop words\" from the text.",
			null);

	static CommandOption.File stoplistFile = new CommandOption.File(
			EnronEmail2Vectors.class,
			"stoplist-file",
			"FILE",
			true,
			null,
			"Instead of the default list, read stop words from a file, one per line. Implies --remove-stopwords",
			null);

	static CommandOption.File extraStopwordsFile = new CommandOption.File(
			EnronEmail2Vectors.class,
			"extra-stopwords",
			"FILE",
			true,
			null,
			"Read whitespace-separated words from this file, and add them to either\n"
					+ "   the default English stoplist or the list specified by --stoplist-file.",
			null);

	static CommandOption.Boolean skipHeader = new CommandOption.Boolean(
			EnronEmail2Vectors.class, "skip-header", "[TRUE|FALSE]", false,
			false,
			"If true, in each document, remove text occurring before a blank line."
					+ "  This is useful for removing email or UseNet headers",
			null);

	static CommandOption.Boolean skipHtml = new CommandOption.Boolean(
			EnronEmail2Vectors.class, "skip-html", "[TRUE|FALSE]", false,
			false,
			"If true, remove text occurring inside <...>, as in HTML or SGML.",
			null);

	static CommandOption.Boolean binaryFeatures = new CommandOption.Boolean(
			EnronEmail2Vectors.class, "binary-features", "[TRUE|FALSE]", false,
			false, "If true, features will be binary.", null);

	static CommandOption.IntegerArray gramSizes = new CommandOption.IntegerArray(
			EnronEmail2Vectors.class,
			"gram-sizes",
			"INTEGER,[INTEGER,...]",
			true,
			new int[] { 1 },
			"Include among the features all n-grams of sizes specified.  "
					+ "For example, to get all unigrams and bigrams, use --gram-sizes 1,2.  "
					+ "This option occurs after the removal of stop words, if removed.",
			null);

	static CommandOption.Boolean keepSequence = new CommandOption.Boolean(
			EnronEmail2Vectors.class,
			"keep-sequence",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, final data will be a FeatureSequence rather than a FeatureVector.",
			null);

	static CommandOption.Boolean stemming = new CommandOption.Boolean(
			EnronEmail2Vectors.class, "stemming", "[TRUE|FALSE]", false, false,
			"If true, perform stemming on the input tokens.", null);

	static CommandOption.Boolean keepSequenceBigrams = new CommandOption.Boolean(
			EnronEmail2Vectors.class,
			"keep-sequence-bigrams",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, final data will be a FeatureSequenceWithBigrams rather than a FeatureVector.",
			null);

	static CommandOption.Boolean saveTextInSource = new CommandOption.Boolean(
			EnronEmail2Vectors.class, "save-text-in-source", "[TRUE|FALSE]",
			false, false, "If true, save original text of document in source.",
			null);

	static CommandOption.ObjectFromBean stringPipe = new CommandOption.ObjectFromBean(
			EnronEmail2Vectors.class,
			"string-pipe",
			"Pipe constructor",
			true,
			null,
			"Java code for the constructor of a Pipe to be run as soon as input becomes a CharSequence",
			null);

	static CommandOption.ObjectFromBean tokenPipe = new CommandOption.ObjectFromBean(
			EnronEmail2Vectors.class,
			"token-pipe",
			"Pipe constructor",
			true,
			null,
			"Java code for the constructor of a Pipe to be run as soon as input becomes a TokenSequence",
			null);

	static CommandOption.ObjectFromBean featureVectorPipe = new CommandOption.ObjectFromBean(
			EnronEmail2Vectors.class,
			"fv-pipe",
			"Pipe constructor",
			true,
			null,
			"Java code for the constructor of a Pipe to be run as soon as input becomes a FeatureVector",
			null);

	static CommandOption.String encoding = new CommandOption.String(
			EnronEmail2Vectors.class, "encoding", "STRING", true, Charset
					.defaultCharset().displayName(),
			"Character encoding for input file", null);

	static CommandOption.String tokenRegex = new CommandOption.String(
			EnronEmail2Vectors.class,
			"token-regex",
			"REGEX",
			true,
			CharSequenceLexer.LEX_ALPHA.toString(),
			"Regular expression used for tokenization.\n"
					+ "   Example: \"[\\p{L}\\p{N}_]+|[\\p{P}]+\" (unicode letters, numbers and underscore OR all punctuation) ",
			null);

	static CommandOption.Boolean printOutput = new CommandOption.Boolean(
			EnronEmail2Vectors.class,
			"print-output",
			"[TRUE|FALSE]",
			false,
			false,
			"If true, print a representation of the processed data\n"
					+ "   to standard output. This option is intended for debugging.",
			null);

	/**
	 * Entry point: parses the options, obtains the instance pipe, feeds every
	 * file below the input directories through it and writes the resulting
	 * instance list to {@code --output} (or standard output for {@code -}).
	 * When {@code --use-pipe-from} was given, the previously loaded instance
	 * list is written back to that file afterwards.
	 *
	 * @param args command-line arguments; with no arguments the usage is
	 *             printed and the JVM exits with status -1
	 * @throws IllegalArgumentException if no input directory was supplied or
	 *                                  the token regular expression is invalid
	 * @throws FileNotFoundException    if an output file cannot be opened
	 * @throws IOException              if serializing an instance list fails
	 */
	public static void main(String[] args) throws FileNotFoundException,
			IOException {
		// Process the command-line options
		CommandOption
				.setSummary(
						EnronEmail2Vectors.class,
						"A tool for creating instance lists of FeatureVectors or FeatureSequences from text documents.\n");
		CommandOption.process(EnronEmail2Vectors.class, args);

		// Print some helpful messages for error cases
		if (args.length == 0) {
			CommandOption.getList(EnronEmail2Vectors.class).printUsage(false);
			System.exit(-1);
		}
		// The declared default for --input is null, so the value must be
		// null-checked before its length is read.
		if (classDirs.value == null || classDirs.value.length == 0) {
			throw new IllegalArgumentException(
					"You must include --input DIR1 DIR2 ...' in order to specify a "
							+ "list of directories containing the documents for each class.");
		}

		// Remove common prefix from all the input class directories
		int commonPrefixIndex = Strings.commonPrefixIndex(classDirs.value);

		logger.info("Labels = ");
		File[] directories = new File[classDirs.value.length];
		for (int i = 0; i < classDirs.value.length; i++) {
			String dirName = classDirs.value[i];
			directories[i] = new File(dirName);
			// Only strip the prefix when something would remain; the bound is
			// the length of this directory name, not the number of directories.
			if (commonPrefixIndex < dirName.length()) {
				logger.info("   " + dirName.substring(commonPrefixIndex));
			} else {
				logger.info("   " + dirName);
			}
		}

		Pipe instancePipe;
		InstanceList previousInstanceList = null;

		if (usePipeFromVectorsFile.wasInvoked()) {
			// Reuse the pipe (and therefore the alphabets) of an existing list.
			previousInstanceList = InstanceList
					.load(usePipeFromVectorsFile.value);
			instancePipe = previousInstanceList.getPipe();
		} else {
			instancePipe = buildPipe();
		}

		InstanceList instances = new InstanceList(instancePipe);

		boolean removeCommonPrefix = true;
		instances.addThruPipe(new FileIterator(directories,
				FileIterator.STARTING_DIRECTORIES, removeCommonPrefix));

		// write vector file
		if (outputFile.value.toString().equals("-")) {
			ObjectOutputStream oos = new ObjectOutputStream(System.out);
			try {
				oos.writeObject(instances);
			} finally {
				// Closing the object stream also closes System.out; nothing
				// is printed by this method afterwards in the stdout case
				// unless --use-pipe-from logging is routed there.
				oos.close();
			}
		} else {
			writeToFile(instances, outputFile.value);
		}

		// *rewrite* vector file used as source of pipe in case we changed the
		// alphabet(!)
		if (usePipeFromVectorsFile.wasInvoked()) {
			logger.info(" rewriting previous instance list, with ID = "
					+ previousInstanceList.getPipe().getInstanceId());
			writeToFile(previousInstanceList, usePipeFromVectorsFile.value);
		}

	}

	/**
	 * Builds a new serial pipe from the current command-line option values.
	 * The stages are added in this order: target to label, save file name in
	 * source, read file contents, optional character-level stages, tokenizer,
	 * optional token-level stages, feature conversion, and optional
	 * feature-vector stages.
	 *
	 * @return the assembled {@link SerialPipes}
	 * @throws IllegalArgumentException if {@code --token-regex} is not a valid
	 *                                  regular expression
	 * @throws IOException              declared so that pipe constructors that
	 *                                  perform I/O may propagate failures
	 */
	private static Pipe buildPipe() throws IOException {
		// Create a list of pipes that will be added to a SerialPipes object
		// later
		ArrayList<Pipe> pipeList = new ArrayList<Pipe>();

		// Convert the "target" object into a numeric index
		// into a LabelAlphabet.
		pipeList.add(new Target2Label());

		// The "data" field is currently a filename. Save it as "source".
		pipeList.add(new SaveDataInSource());

		if (enronDataset.value) {
			// Special handling Enron dataset.
			pipeList.add(new EnronEmail2CharSequence());
		} else {
			// Set "data" to the file's contents. "data" is now a String.
			pipeList.add(new Input2CharSequence(encoding.value));
		}

		// Optionally save the text to "source" -- not recommended if
		// memory is scarce. Tested via the option's value, like every other
		// boolean option here, so "--save-text-in-source false" disables it.
		if (saveTextInSource.value) {
			pipeList.add(new SaveDataInSource());
		}

		// Allow the user to specify an arbitrary Pipe object
		// that operates on Strings
		if (stringPipe.wasInvoked()) {
			pipeList.add((Pipe) stringPipe.value);
		}

		// Remove all content before the first empty line.
		// Useful for email and usenet news posts.
		if (skipHeader.value) {
			pipeList.add(new CharSubsequence(CharSubsequence.SKIP_HEADER));
		}

		// Remove HTML tags. Suitable for SGML and XML.
		if (skipHtml.value) {
			pipeList.add(new CharSequenceRemoveHTML());
		}

		// Tokenize the input: first compile the tokenization pattern
		Pattern tokenPattern;
		if (keepSequenceBigrams.value) {
			// We do not want to record bigrams across punctuation,
			// so we need to keep non-word tokens.
			tokenPattern = CharSequenceLexer.LEX_NONWHITESPACE_CLASSES;
		} else {
			// Otherwise, try to compile the regular expression pattern.
			try {
				tokenPattern = Pattern.compile(tokenRegex.value);
			} catch (PatternSyntaxException pse) {
				// Keep the original exception as the cause for diagnostics.
				throw new IllegalArgumentException(
						"The token regular expression (" + tokenRegex.value
								+ ") was invalid: " + pse.getMessage(), pse);
			}
		}

		// Add the tokenizer
		pipeList.add(new CharSequence2TokenSequence(tokenPattern));

		if (stemming.value) {
			pipeList.add(new TokenSequenceStemmer());
		}

		// Allow user to specify an arbitrary Pipe object
		// that operates on TokenSequence objects.
		if (tokenPipe.wasInvoked()) {
			pipeList.add((Pipe) tokenPipe.value);
		}

		if (!preserveCase.value()) {
			pipeList.add(new TokenSequenceLowercase());
		}

		if (keepSequenceBigrams.value) {
			// Remove non-word tokens, but record the fact that they
			// were there.
			pipeList.add(new TokenSequenceRemoveNonAlpha(true));
		}

		// Stopword removal.
		if (stoplistFile.wasInvoked()) {
			// The user specified a new list; the two literal false arguments
			// are passed exactly as before (the first one means "don't
			// include the default list").
			TokenSequenceRemoveStopwords stopwordFilter = new TokenSequenceRemoveStopwords(
					stoplistFile.value, encoding.value, false, false,
					keepSequenceBigrams.value);

			if (extraStopwordsFile.wasInvoked()) {
				stopwordFilter.addStopWords(extraStopwordsFile.value);
			}
			pipeList.add(stopwordFilter);

		} else if (removeStopWords.value) {
			// The user did not specify a new list, so use the default
			// built-in English list, possibly adding extra words.
			TokenSequenceRemoveStopwords stopwordFilter = new TokenSequenceRemoveStopwords(
					false, keepSequenceBigrams.value);

			if (extraStopwordsFile.wasInvoked()) {
				stopwordFilter.addStopWords(extraStopwordsFile.value);
			}

			pipeList.add(stopwordFilter);
		}

		// gramSizes is an integer array, with default value [1].
		// Check if we have a non-default value.
		if (!(gramSizes.value.length == 1 && gramSizes.value[0] == 1)) {
			pipeList.add(new TokenSequenceNGrams(gramSizes.value));
		}

		// So far we have a sequence of Token objects that contain
		// String values. Look these up in an alphabet and store integer IDs
		// ("features") instead of Strings.
		if (keepSequenceBigrams.value) {
			pipeList.add(new TokenSequence2FeatureSequenceWithBigrams());
		} else {
			pipeList.add(new TokenSequence2FeatureSequence());
		}

		// For many applications, we do not need to preserve the sequence of
		// features, only the number of times a feature occurs.
		if (!(keepSequence.value || keepSequenceBigrams.value)) {
			pipeList.add(new FeatureSequence2AugmentableFeatureVector(
					binaryFeatures.value));
		}

		// Allow users to specify an arbitrary Pipe object that operates on
		// feature vectors.
		if (featureVectorPipe.wasInvoked()) {
			pipeList.add((Pipe) featureVectorPipe.value);
		}

		if (printOutput.value) {
			pipeList.add(new PrintInputAndTarget());
		}

		return new SerialPipes(pipeList);
	}

	/**
	 * Serializes {@code object} to {@code file} using Java object
	 * serialization, releasing the file handle even when writing fails.
	 *
	 * @param object the object to serialize
	 * @param file   the destination file; it is created or overwritten
	 * @throws IOException if the file cannot be opened or written
	 */
	private static void writeToFile(Object object, File file)
			throws IOException {
		FileOutputStream fileStream = new FileOutputStream(file);
		try {
			ObjectOutputStream oos = new ObjectOutputStream(fileStream);
			oos.writeObject(object);
			// Flushes buffered object data and closes the underlying stream.
			oos.close();
		} finally {
			// Harmless if already closed above; guarantees release on failure.
			fileStream.close();
		}
	}

}
